Enter the name of the tissue you want to analyze. File paths are built with `here()`, so no directory needs to be entered.
# Tissue to analyze; used below to filter the plate metadata, to build the
# counts filename, and as the Seurat project name.
tissue_of_interest <- "Spleen"
Load the requisite packages and some additional helper functions.
library(here)
library(useful)
library(Seurat)
library(dplyr)
library(Matrix)
library(ontologyIndex)
# Fetch the basic Cell Ontology OBO file from GitHub and parse it with
# ontologyIndex::get_ontology(). The helper functions in this file look terms
# up through its `name` and `id` fields.
# NOTE(review): this needs network access and reads from the `master` branch,
# so the ontology contents may change between runs -- confirm this is intended.
cell_ontology = get_ontology('https://raw.githubusercontent.com/obophenotype/cell-ontology/master/cl-basic.obo', extract_tags='everything')
# Check that every entry of `cell_ontology_class` is either NA or a term name
# present in `cell_ontology$name`.
#
# Args:
#   cell_ontology_class: vector of cell ontology class names (NA allowed).
# Returns:
#   NULL (invisibly) when every entry is valid.
# Raises:
#   An error listing each entry that is not in the cell ontology, one per line.
validate_cell_ontology <- function(cell_ontology_class) {
  is_known <- vapply(
    cell_ontology_class,
    function(term) is.element(term, cell_ontology$name) || is.na(term),
    logical(1)
  )
  if (all(is_known)) {
    return(invisible(NULL))
  }
  unknown_terms <- cell_ontology_class[!is_known]
  stop(paste0('"', unknown_terms, '" is not in the cell ontology\n'))
}
# Map cell ontology class names to their ontology IDs.
#
# Args:
#   cell_ontology_class: vector of term names to look up in
#     `cell_ontology$name`.
# Returns:
#   A character vector with one ID per input: the first entry of
#   `cell_ontology$id` whose name equals the input, or NA when the input is NA
#   or has no match. For character input the result is named by the inputs.
convert_to_cell_ontology_id <- function(cell_ontology_class) {
  # vapply() keeps the result a character vector even for zero-length input,
  # where sapply() would return an empty list.
  vapply(
    cell_ontology_class,
    function(x) as.character(cell_ontology$id[cell_ontology$name == x])[1],
    character(1)
  )
}
# NOTE(review): presumably the output directory for the saved tissue object;
# it is not used in the code visible here.
save_dir <- here('00_data_ingest', 'tissue_robj')

# Read the per-plate FACS metadata so the plates for this tissue can be found.
plate_metadata_filename <- here(
  '00_data_ingest', '00_facs_raw_data', 'metadata_FACS.csv'
)
plate_metadata <- read.csv(
  file = plate_metadata_filename,
  header = TRUE,
  sep = ","
)
# The first column is used as the join key with the per-cell plate barcodes.
names(plate_metadata)[1] <- "plate.barcode"
plate_metadata
Subset the metadata on the tissue.
# Keep only the plates from the tissue of interest, then the four columns
# needed to describe each plate.
tissue_plates <- filter(plate_metadata, tissue == tissue_of_interest)
tissue_plates <- tissue_plates[
  , c('plate.barcode', 'tissue', 'subtissue', 'mouse.sex')
]
tissue_plates
Load the read count data.
# Read this tissue's counts table. The first CSV column becomes the row names
# (genes); the remaining columns are cells.
filename <- here(
  '00_data_ingest', '00_facs_raw_data', 'FACS',
  paste0(tissue_of_interest, '-counts.csv')
)
raw.data <- read.csv(file = filename, row.names = 1, sep = ",")
# raw.data = data.frame(row.names = rownames(raw.data))
# Peek at the top-left corner of the table.
corner(raw.data)
Make a vector of plate barcodes for each cell
# Cell names look like "A21.MAA000508.3_9_M.1.1": take the text before the
# first "_" and, within it, the second "."-separated field (the plate barcode).
plate.barcodes <- lapply(colnames(raw.data), function(cell_name) {
  name_prefix <- strsplit(cell_name, "_")[[1]][1]
  strsplit(name_prefix, '.', fixed = TRUE)[[1]][2]
})
head(plate.barcodes)
[[1]]
[1] "MAA000508"
[[2]]
[1] "MAA000508"
[[3]]
[1] "MAA000508"
[[4]]
[1] "MAA000508"
[[5]]
[1] "MAA000508"
[[6]]
[1] "MAA000508"
Use only the metadata rows corresponding to the plates of the tissue of interest. Make a plate barcode dataframe to “expand” the per-plate metadata to be per-cell.
# Arrange the per-cell plate barcodes as a one-column character matrix:
# one row per cell (row names are the cell names from the counts table),
# single column "plate.barcode".
barcode.df <- matrix(
  unlist(plate.barcodes),
  ncol = 1,
  dimnames = list(colnames(raw.data), 'plate.barcode')
)
head(barcode.df)
plate.barcode
A21.MAA000508.3_9_M.1.1 "MAA000508"
C6.MAA000508.3_9_M.1.1 "MAA000508"
A22.MAA000508.3_9_M.1.1 "MAA000508"
C8.MAA000508.3_9_M.1.1 "MAA000508"
E8.MAA000508.3_9_M.1.1 "MAA000508"
E6.MAA000508.3_9_M.1.1 "MAA000508"
# Expand the per-plate metadata to one row per cell.
#
# Each cell's plate is looked up with match() rather than merge(): merge()
# leaves its output rows in an unspecified order when sort = FALSE, so
# assigning the cell names to its result by position could attach a cell to
# another plate's metadata.
rnames <- row.names(barcode.df)
cell.plate.barcodes <- as.character(barcode.df[, 'plate.barcode'])
plate.rows <- match(
  cell.plate.barcodes,
  as.character(plate_metadata$plate.barcode)
)
# Fail with a clear message instead of an obscure row.names length error.
if (anyNA(plate.rows)) {
  stop(
    'No plate metadata found for plate barcode(s): ',
    paste(unique(cell.plate.barcodes[is.na(plate.rows)]), collapse = ', ')
  )
}
meta.data <- plate_metadata[plate.rows, , drop = FALSE]
row.names(meta.data) <- rnames
# Sort cells by plate barcode because that's how the data was originally
meta.data <- meta.data[order(meta.data$plate.barcode), ]
corner(meta.data)
# Reorder the count columns so they line up with the rows of meta.data.
raw.data <- raw.data[, rownames(meta.data)]
corner(raw.data)
Process the raw data and load it into the Seurat object.
# Find ERCC rows (row names starting with "ERCC-"), compute the ERCC share of
# each cell's counts, and drop those rows from the raw data.
# Note: despite its name, percent.ercc is a fraction between 0 and 1.
is.ercc <- grepl(pattern = "^ERCC-", x = rownames(x = raw.data))
erccs <- rownames(x = raw.data)[is.ercc]
percent.ercc <- Matrix::colSums(raw.data[is.ercc, ]) / Matrix::colSums(raw.data)
ercc.index <- which(is.ercc)
# Use a logical mask rather than a negative index: with no ERCC rows,
# raw.data[-integer(0), ] would select zero rows and discard every gene.
raw.data <- raw.data[!is.ercc, ]
# Build the Seurat object from the counts table, using the tissue name as the
# project label.
tiss <- CreateSeuratObject(
  raw.data = raw.data,
  project = tissue_of_interest,
  min.cells = 5,
  min.genes = 5
)
# Attach the per-cell plate metadata and the ERCC fraction.
tiss <- AddMetaData(object = tiss, meta.data)
tiss <- AddMetaData(object = tiss, percent.ercc, col.name = "percent.ercc")
# Rename the sum-of-counts column from nUMI to nReads
is_numi_column <- colnames(tiss@meta.data) == 'nUMI'
colnames(tiss@meta.data)[is_numi_column] <- 'nReads'
# Create empty (NA) annotation columns to be filled in later
tiss@meta.data$free_annotation <- NA
tiss@meta.data$cell_ontology_class <- NA
tiss@meta.data$subcell_ontology_class <- NA
Calculate percent ribosomal genes.
# Ribosomal genes: names starting with "Rps" or "Rpl" followed by a digit.
gene.names <- rownames(x = tiss@data)
ribo.genes <- gene.names[grepl(pattern = "^Rp[sl][[:digit:]]", x = gene.names)]
# Share of each cell's raw counts that comes from those genes (a fraction).
ribo.counts <- Matrix::colSums(tiss@raw.data[ribo.genes, ])
percent.ribo <- ribo.counts / Matrix::colSums(tiss@raw.data)
tiss <- AddMetaData(
  object = tiss,
  metadata = percent.ribo,
  col.name = "percent.ribo"
)
A sanity check: genes per cell vs reads per cell.
# Plot reads per cell (nReads) against genes per cell (nGene).
GenePlot(
  object = tiss,
  gene1 = "nReads",
  gene2 = "nGene",
  use.raw = TRUE
)
Filter out cells with few reads and few genes.
# Keep cells whose nGene lies between 500 and 25000 and whose nReads lies
# between 50000 and 2000000 (thresholds are listed in the same order as
# subset.names).
tiss <- FilterCells(
  object = tiss,
  subset.names = c("nGene", "nReads"),
  low.thresholds = c(500, 50000),
  high.thresholds = c(25000, 2000000)
)
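Normalize the data, then scale it. (Note: `ScaleData` is called below without any variables to regress out, so correlation with total reads is not regressed out here.)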
# Log-normalize the counts with a scale factor of 1e6.
tiss <- NormalizeData(object = tiss, scale.factor = 1e6)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
# Scale the data matrix with Seurat's defaults; no regression variables are
# passed here.
tiss <- ScaleData(object = tiss)
[1] "Scaling data matrix"
|
| | 0%
|
|===========================================================================================================================| 100%
# Select variable genes from per-gene means and variance-to-mean ratios and
# draw the diagnostic plot (do.plot = TRUE).
# NOTE(review): presumably x.* cutoffs bound the mean and y.cutoff the
# dispersion -- confirm against the Seurat documentation.
tiss <- FindVariableGenes(object = tiss, do.plot = TRUE, x.low.cutoff = 0.7 , x.high.cutoff = Inf, y.cutoff = 0.4)
Calculating gene means
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variance to mean ratios
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Run Principal Component Analysis.
# Run PCA on the Seurat object, with printed output turned off.
tiss <- RunPCA(object = tiss, do.print = FALSE)
# NOTE(review): presumably projects the remaining genes onto the computed
# components -- confirm against the Seurat documentation.
tiss <- ProjectPCA(object = tiss, do.print = FALSE)
Later on (in FindClusters and TSNE) you will pick a number of principal components to use. This has the effect of keeping the major directions of variation in the data and, ideally, suppressing noise. There is no correct answer to the number to use, but a decent rule of thumb is to go until the plot plateaus.
# Elbow plot of the principal components; use it to choose n.pcs.
PCElbowPlot(object = tiss)
Choose the number of principal components to use.
# Number of principal components used for clustering and t-SNE.
n.pcs <- 7
The clustering is performed based on a nearest neighbors graph. Cells that have similar expression will be joined together. The Louvain algorithm looks for groups of cells with high modularity–more connections within the group than between groups. The resolution parameter determines the scale…higher resolution will give more clusters, lower resolution will give fewer.
For the top-level clustering, aim to under-cluster instead of over-cluster. It will be easy to subset groups and further analyze them below.
# Clustering resolution: higher values give more clusters, lower values fewer.
res.used <- 2.5

# Cluster the cells on the first n.pcs principal components and keep the SNN
# graph on the object (save.SNN = TRUE).
tiss <- FindClusters(
  object = tiss,
  reduction.type = "pca",
  dims.use = 1:n.pcs,
  resolution = res.used,
  print.output = 0,
  save.SNN = TRUE
)
To visualize the clusters, compute a t-SNE embedding from the same principal components.
# If cells are too spread out, you can raise the perplexity. If you have few
# cells, try a lower perplexity (but never less than 10).
tiss <- RunTSNE(
  object = tiss,
  dims.use = 1:n.pcs,
  seed.use = 10,
  perplexity = 20
)
# do.label = TRUE labels each cluster on the plot.
TSNEPlot(object = tiss, do.label = TRUE)
Check expression of genes of interest.
Dotplots let you see the intensity of expression and the fraction of cells expressing for each of your genes of interest.
How big are the clusters?
# Count the cells assigned to each cluster identity.
table(tiss@ident)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
147 144 141 136 135 129 128 125 117 113 104 77 58 50 49 36
Which markers identify a specific cluster?
# Markers for cluster 3 compared against cluster 1, keeping positive markers
# only (only.pos = TRUE).
# NOTE(review): min.pct / thresh.use are presumably the minimum expressing
# fraction and minimum expression difference -- confirm against the Seurat
# documentation.
clust.markers <- FindMarkers(object = tiss, ident.1 = 3, ident.2 = 1, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
| | 0 % ~calculating
|+ | 1 % ~09s
|++ | 2 % ~08s
|++ | 3 % ~08s
|+++ | 4 % ~08s
|+++ | 5 % ~08s
|++++ | 6 % ~08s
|++++ | 7 % ~08s
|+++++ | 8 % ~08s
|+++++ | 9 % ~08s
|++++++ | 10% ~08s
|++++++ | 11% ~08s
|+++++++ | 12% ~07s
|+++++++ | 13% ~07s
|++++++++ | 14% ~07s
|++++++++ | 15% ~07s
|+++++++++ | 16% ~07s
|+++++++++ | 17% ~07s
|++++++++++ | 18% ~07s
|++++++++++ | 19% ~07s
|+++++++++++ | 20% ~07s
|+++++++++++ | 21% ~07s
|++++++++++++ | 22% ~07s
|++++++++++++ | 23% ~07s
|+++++++++++++ | 24% ~06s
|+++++++++++++ | 26% ~06s
|++++++++++++++ | 27% ~06s
|++++++++++++++ | 28% ~06s
|+++++++++++++++ | 29% ~06s
|+++++++++++++++ | 30% ~06s
|++++++++++++++++ | 31% ~06s
|++++++++++++++++ | 32% ~06s
|+++++++++++++++++ | 33% ~05s
|+++++++++++++++++ | 34% ~05s
|++++++++++++++++++ | 35% ~05s
|++++++++++++++++++ | 36% ~05s
|+++++++++++++++++++ | 37% ~05s
|+++++++++++++++++++ | 38% ~05s
|++++++++++++++++++++ | 39% ~05s
|++++++++++++++++++++ | 40% ~05s
|+++++++++++++++++++++ | 41% ~05s
|+++++++++++++++++++++ | 42% ~05s
|++++++++++++++++++++++ | 43% ~04s
|++++++++++++++++++++++ | 44% ~04s
|+++++++++++++++++++++++ | 45% ~04s
|+++++++++++++++++++++++ | 46% ~04s
|++++++++++++++++++++++++ | 47% ~04s
|++++++++++++++++++++++++ | 48% ~04s
|+++++++++++++++++++++++++ | 49% ~04s
|+++++++++++++++++++++++++ | 50% ~04s
|++++++++++++++++++++++++++ | 51% ~04s
|+++++++++++++++++++++++++++ | 52% ~04s
|+++++++++++++++++++++++++++ | 53% ~04s
|++++++++++++++++++++++++++++ | 54% ~03s
|++++++++++++++++++++++++++++ | 55% ~03s
|+++++++++++++++++++++++++++++ | 56% ~03s
|+++++++++++++++++++++++++++++ | 57% ~03s
|++++++++++++++++++++++++++++++ | 58% ~03s
|++++++++++++++++++++++++++++++ | 59% ~03s
|+++++++++++++++++++++++++++++++ | 60% ~03s
|+++++++++++++++++++++++++++++++ | 61% ~03s
|++++++++++++++++++++++++++++++++ | 62% ~03s
|++++++++++++++++++++++++++++++++ | 63% ~03s
|+++++++++++++++++++++++++++++++++ | 64% ~03s
|+++++++++++++++++++++++++++++++++ | 65% ~03s
|++++++++++++++++++++++++++++++++++ | 66% ~03s
|++++++++++++++++++++++++++++++++++ | 67% ~02s
|+++++++++++++++++++++++++++++++++++ | 68% ~02s
|+++++++++++++++++++++++++++++++++++ | 69% ~02s
|++++++++++++++++++++++++++++++++++++ | 70% ~02s
|++++++++++++++++++++++++++++++++++++ | 71% ~02s
|+++++++++++++++++++++++++++++++++++++ | 72% ~02s
|+++++++++++++++++++++++++++++++++++++ | 73% ~02s
|++++++++++++++++++++++++++++++++++++++ | 74% ~02s
|++++++++++++++++++++++++++++++++++++++ | 76% ~02s
|+++++++++++++++++++++++++++++++++++++++ | 77% ~02s
|+++++++++++++++++++++++++++++++++++++++ | 78% ~02s
|++++++++++++++++++++++++++++++++++++++++ | 79% ~02s
|++++++++++++++++++++++++++++++++++++++++ | 80% ~02s
|+++++++++++++++++++++++++++++++++++++++++ | 81% ~01s
|+++++++++++++++++++++++++++++++++++++++++ | 82% ~01s
|++++++++++++++++++++++++++++++++++++++++++ | 83% ~01s
|++++++++++++++++++++++++++++++++++++++++++ | 84% ~01s
|+++++++++++++++++++++++++++++++++++++++++++ | 85% ~01s
|+++++++++++++++++++++++++++++++++++++++++++ | 86% ~01s
|++++++++++++++++++++++++++++++++++++++++++++ | 87% ~01s
|++++++++++++++++++++++++++++++++++++++++++++ | 88% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++ | 89% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++ | 90% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++ | 91% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++ | 92% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 93% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 94% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 95% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 96% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 07s
# Show the first 20 marker rows in the order FindMarkers returned them.
# The former extra positional `avg_diff` argument was dropped: it is not a
# defined object and only fell into head()'s `...`, so it never sorted or
# selected anything.
print(x = head(x = clust.markers, n = 20))
You can also compute all markers for all clusters at once. This may take some time.
# Compute positive markers (only.pos = TRUE) for every cluster in one call,
# using the same min.pct and thresh.use values as the FindMarkers call above.
tiss.markers <- FindAllMarkers(object = tiss, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
| | 0 % ~calculating
|+ | 1 % ~01m 20s
|++ | 2 % ~01m 17s
|++ | 3 % ~01m 15s
|+++ | 4 % ~01m 15s
|+++ | 5 % ~01m 14s
|++++ | 6 % ~01m 14s
|++++ | 7 % ~01m 13s
|+++++ | 8 % ~01m 14s
|+++++ | 9 % ~01m 13s
|++++++ | 10% ~01m 12s
|++++++ | 11% ~01m 11s
|+++++++ | 12% ~01m 10s
|+++++++ | 13% ~01m 09s
|++++++++ | 14% ~01m 09s
|++++++++ | 15% ~01m 08s
|+++++++++ | 16% ~01m 08s
|+++++++++ | 17% ~01m 07s
|++++++++++ | 18% ~01m 06s
|++++++++++ | 19% ~01m 05s
|+++++++++++ | 20% ~01m 04s
|+++++++++++ | 21% ~01m 03s
|++++++++++++ | 22% ~01m 02s
|++++++++++++ | 23% ~01m 01s
|+++++++++++++ | 24% ~01m 01s
|+++++++++++++ | 26% ~01m 00s
|++++++++++++++ | 27% ~60s
|++++++++++++++ | 28% ~59s
|+++++++++++++++ | 29% ~58s
|+++++++++++++++ | 30% ~57s
|++++++++++++++++ | 31% ~56s
|++++++++++++++++ | 32% ~55s
|+++++++++++++++++ | 33% ~54s
|+++++++++++++++++ | 34% ~53s
|++++++++++++++++++ | 35% ~52s
|++++++++++++++++++ | 36% ~51s
|+++++++++++++++++++ | 37% ~50s
|+++++++++++++++++++ | 38% ~50s
|++++++++++++++++++++ | 39% ~49s
|++++++++++++++++++++ | 40% ~48s
|+++++++++++++++++++++ | 41% ~47s
|+++++++++++++++++++++ | 42% ~47s
|++++++++++++++++++++++ | 43% ~46s
|++++++++++++++++++++++ | 44% ~45s
|+++++++++++++++++++++++ | 45% ~44s
|+++++++++++++++++++++++ | 46% ~44s
|++++++++++++++++++++++++ | 47% ~43s
|++++++++++++++++++++++++ | 48% ~42s
|+++++++++++++++++++++++++ | 49% ~41s
|+++++++++++++++++++++++++ | 50% ~41s
|++++++++++++++++++++++++++ | 51% ~40s
|+++++++++++++++++++++++++++ | 52% ~39s
|+++++++++++++++++++++++++++ | 53% ~38s
|++++++++++++++++++++++++++++ | 54% ~37s
|++++++++++++++++++++++++++++ | 55% ~37s
|+++++++++++++++++++++++++++++ | 56% ~36s
|+++++++++++++++++++++++++++++ | 57% ~35s
|++++++++++++++++++++++++++++++ | 58% ~34s
|++++++++++++++++++++++++++++++ | 59% ~33s
|+++++++++++++++++++++++++++++++ | 60% ~33s
|+++++++++++++++++++++++++++++++ | 61% ~32s
|++++++++++++++++++++++++++++++++ | 62% ~31s
|++++++++++++++++++++++++++++++++ | 63% ~30s
|+++++++++++++++++++++++++++++++++ | 64% ~29s
|+++++++++++++++++++++++++++++++++ | 65% ~28s
|++++++++++++++++++++++++++++++++++ | 66% ~28s
|++++++++++++++++++++++++++++++++++ | 67% ~27s
|+++++++++++++++++++++++++++++++++++ | 68% ~26s
|+++++++++++++++++++++++++++++++++++ | 69% ~25s
|++++++++++++++++++++++++++++++++++++ | 70% ~24s
|++++++++++++++++++++++++++++++++++++ | 71% ~23s
|+++++++++++++++++++++++++++++++++++++ | 72% ~22s
|+++++++++++++++++++++++++++++++++++++ | 73% ~22s
|++++++++++++++++++++++++++++++++++++++ | 74% ~21s
|++++++++++++++++++++++++++++++++++++++ | 76% ~20s
|+++++++++++++++++++++++++++++++++++++++ | 77% ~19s
|+++++++++++++++++++++++++++++++++++++++ | 78% ~18s
|++++++++++++++++++++++++++++++++++++++++ | 79% ~17s
|++++++++++++++++++++++++++++++++++++++++ | 80% ~17s
|+++++++++++++++++++++++++++++++++++++++++ | 81% ~16s
|+++++++++++++++++++++++++++++++++++++++++ | 82% ~15s
|++++++++++++++++++++++++++++++++++++++++++ | 83% ~14s
|++++++++++++++++++++++++++++++++++++++++++ | 84% ~13s
|+++++++++++++++++++++++++++++++++++++++++++ | 85% ~12s
|+++++++++++++++++++++++++++++++++++++++++++ | 86% ~12s
|++++++++++++++++++++++++++++++++++++++++++++ | 87% ~11s
|++++++++++++++++++++++++++++++++++++++++++++ | 88% ~10s
|+++++++++++++++++++++++++++++++++++++++++++++ | 89% ~09s
|+++++++++++++++++++++++++++++++++++++++++++++ | 90% ~08s
|++++++++++++++++++++++++++++++++++++++++++++++ | 91% ~07s
|++++++++++++++++++++++++++++++++++++++++++++++ | 92% ~07s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 93% ~06s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 94% ~05s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 95% ~04s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 96% ~03s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~02s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~02s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 01m 21s
| | 0 % ~calculating
|+ | 1 % ~33s
|++ | 2 % ~31s
|++ | 3 % ~30s
|+++ | 4 % ~29s
|+++ | 5 % ~28s
|++++ | 6 % ~28s
|++++ | 7 % ~28s
|+++++ | 8 % ~28s
|+++++ | 9 % ~28s
|++++++ | 10% ~28s
|++++++ | 11% ~29s
|+++++++ | 12% ~31s
|+++++++ | 13% ~30s
|++++++++ | 14% ~30s
|++++++++ | 15% ~30s
|+++++++++ | 16% ~29s
|+++++++++ | 17% ~29s
Display the top markers you computed above.
At a coarse level, we can use canonical markers to match the unbiased clustering to known cell types:
Color by metadata, like plate barcode, to check for batch effects.
Print a table showing the count of cells in each identity category from each plate.
When you save the annotated tissue, please give it a name.
So that Biohub can easily combine all your cell_ontology_class annotations, please export them as a simple csv.